library(dtwclust)
library(dtw)
df<-read.csv("../clean_data/gap_list_year.csv")
df
df <- subset(df, select = -c(X) )
df
#colnames(df)[colnames(df) == "X2000"] = "2000"
colnames(df)[colnames(df) == "X2003"] = "2003"
colnames(df)[colnames(df) == "X2005"] = "2005"
colnames(df)[colnames(df) == "X2007"] = "2007"
colnames(df)[colnames(df) == "X2009"] = "2009"
colnames(df)[colnames(df) == "X2011"] = "2011"
colnames(df)[colnames(df) == "X2013"] = "2013"
colnames(df)[colnames(df) == "X2015"] = "2015"
colnames(df)[colnames(df) == "X2017"] = "2017"
colnames(df)[colnames(df) == "X2019"] = "2019"
colnames(df)[colnames(df) == "X2022"] = "2022"
df
jurisdiction = df[['Jurisdiction']]
dtw_df <- df[, -1]
dtw_df
df_lst <- tslist(dtw_df)
remove_nan <- function(ts) {
ts[!is.na(ts)]
}
# Apply the function to each time series in the list
df_lst <- lapply(df_lst, remove_nan)
df_lst
$`1`
[1] 2 -3 2 -1 0 0 -1 3 -3 1
$`2`
[1] 2 2 -1 0 -1 1 -1 0 5 3
$`3`
[1] 0 0 3 3 6 1 0 5 -2 2
$`4`
[1] -2 -3 0 -2 2 -1 0 1 -3 2
$`5`
[1] 2 1 0 4 0 2 -1 0 -2 1
$`6`
[1] 1 0 1 3 -1 -1 3 0 1 2
$`7`
[1] 2 0 -1 -1 2 0 1 -1 -1 3
$`8`
[1] 2 4 4 1 -2 3 0 0 -1 2
$`9`
[1] 4 4 1 3 1 2 -1 4 1 2
$`10`
[1] 1 1 1 -1 1 0 0 2 -1 3
$`11`
[1] -1 -1 -3 -5 -2 -4 -5 -4 -1 0
$`12`
[1] 2 -2 3 2 1 1 0 2 4 2
$`13`
[1] 2 3 3 4 0 0 0 5 1 5
$`14`
[1] 2 3 2 3 0 1 1 -1 1 1
$`15`
[1] 2 -1 3 1 2 2 2 -1 0 2
$`16`
[1] 0 2 2 3 2 1 0 5 2 2
$`17`
[1] 1 2 3 3 1 1 -2 -4 -4 0
$`18`
[1] 1 -1 1 -1 -2 0 -2 1 -3 1
$`19`
[1] 2 2 3 4 -1 3 1 0 1 3
$`20`
[1] 3 0 3 3 2 -1 -3 2 0 -1
$`21`
[1] 5 -1 4 2 1 1 -4 -2 -3 1
$`22`
[1] 1 4 3 3 3 -1 3 1 -1 4
$`23`
[1] -3 2 0 3 0 1 0 0 -3 2
$`24`
[1] 2 1 2 0 -5 0 -4 0 -2 1
$`25`
[1] 2 3 3 2 2 0 3 1 0 4
$`26`
[1] 0 -1 0 1 0 0 3 -1 -2 1
$`27`
[1] 1 2 2 2 1 1 0 1 0 2
$`28`
[1] 3 2 3 3 2 0 1 2 -2 3
$`29`
[1] 0 1 1 2 2 2 0 1 -1 3
$`30`
[1] 1 1 1 1 0 -1 2 2 2 5
$`31`
[1] 1 4 2 5 0 -2 1 2 0 2
$`32`
[1] 1 2 1 2 1 0 2 2 1 2
$`33`
[1] 2 0 1 1 -1 -3 0 -1 1 2
$`34`
[1] -1 -1 2 0 -2 -1 -1 -1 -5 -1
$`35`
[1] 0 0 3 3 2 1 0 0 -1 3
$`36`
[1] 2 2 3 3 2 1 -2 0 -2 0
$`37`
[1] 0 1 4 4 2 -1 1 -1 1 2
$`38`
[1] 2 3 2 4 5 3 -2 2 -1 4
$`39`
[1] 3 4 6 3 2 1 -2 -3 -2 4
$`40`
[1] 2 -1 1 0 0 0 -3 1 -1 2
$`41`
[1] 6 1 -1 1 -2 2 -2 -1 -1 0
$`42`
[1] 2 0 3 3 1 1 -2 -1 -3 1
$`43`
[1] 0 -1 6 0 4 0 0 -1 -2 -2
$`44`
[1] 2 4 2 1 1 1 -1 1 -1 5
$`45`
[1] 2 2 2 2 4 1 2 2 2 7
$`46`
[1] 0 0 2 2 0 -1 3 -1 -1 -1
$`47`
[1] 3 2 3 2 0 -1 2 0 -1 2
$`48`
[1] 1 0 0 2 0 1 2 3 0 7
$`49`
[1] 0 -2 2 1 2 0 1 0 -1 1
$`50`
[1] 0 1 3 2 3 0 0 2 -1 4
$`51`
[1] 1 2 2 4 5 3 -2 0 -3 1
df_cvi <- list()
for (i in 2:10){
df_clust <- tsclust(df_lst, type = "partitional", k = i, distance = "dtw_basic", centroid = "pam")
df_metric <- cvi(df_clust, type = "valid", log.base = 10)
df_cvi <- append(df_cvi, list(df_metric))
}
df_cvi_ma <- do.call(rbind, df_cvi)
rw <- c("K2","K3","K4","K5","K6","K7","K8","K9","K10")
rownames(df_cvi_ma) <- rw
print(df_cvi_ma)
Sil SF CH DB DBstar D COP
K2 0.173608338 5.754619e-11 18.629393 1.850840 1.850840 0.17021277 0.5083717
K3 0.084548205 1.332268e-15 12.539101 1.206965 1.255541 0.20000000 0.4464816
K4 0.016543586 0.000000e+00 9.156897 2.550758 2.860042 0.11363636 0.4446285
K5 0.072641023 0.000000e+00 9.690355 1.448202 1.482701 0.06818182 0.4591240
K6 0.036634943 0.000000e+00 6.100000 2.018347 2.206632 0.12820513 0.3933934
K7 -0.002914961 0.000000e+00 8.727605 1.896195 2.312031 0.06818182 0.4087914
K8 0.038724967 0.000000e+00 5.381346 2.631139 3.220004 0.09677419 0.3156996
K9 0.056284676 0.000000e+00 6.053341 1.413101 1.646940 0.16666667 0.3210506
K10 -0.038017521 0.000000e+00 4.723548 1.567478 1.846030 0.17948718 0.3324643
– “Sil” (!): Silhouette index (Rousseeuw (1987); to be maximized).-K4 – “SF” (~): Score Function (Saitta et al. (2007); to be maximized; see notes). – “CH” (~): Calinski-Harabasz index (Arbelaitz et al. (2013); to be maximized).-k3 – “DB” (?): Davies-Bouldin index (Arbelaitz et al. (2013); to be minimized).k4 – “DBstar” (?): Modified Davies-Bouldin index (DB*) (Kim and Ramakrishna (2005); to be minimized). -k4 – “D” (!): Dunn index (Arbelaitz et al. (2013); to be maximized). k5 – “COP” (!): COP index (Arbelaitz et al. (2013); to be minimized). k9
#k2
df_clust_opt_2 <- tsclust(df_lst, type = "partitional", k = 2, distance = "dtw", centroid = "pam",seed = 725)
plot(df_clust_opt_2)
#k3
df_clust_opt <- tsclust(df_lst, type = "partitional", k = 3, distance = "dtw", centroid = "pam",seed = 725)
plot(df_clust_opt)
#k4
df_clust_opt2 <- tsclust(df_lst, type = "partitional", k = 4, distance = "dtw", centroid = "pam",seed = 725)
plot(df_clust_opt2)
#k5
df_clust_opt3 <- tsclust(df_lst, type = "partitional", k = 5, distance = "dtw", centroid = "pam",seed = 725)
plot(df_clust_opt3)
#k9
df_clust_opt4 <- tsclust(df_lst, type = "partitional", k = 9, distance = "dtw", centroid = "pam",seed = 725)
plot(df_clust_opt4)
# Extract cluster assignments
cluster_assignments <- df_clust_opt2@cluster
# Determine the number of clusters
num_clusters <- max(cluster_assignments)
# Loop through each cluster and print the jurisdictions in it
for (cluster_number in 1:num_clusters) {
cat("Jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Print the jurisdictions corresponding to these indices
print(jurisdiction[indices_in_cluster])
cat("\n") # Add a newline for readability
}
Jurisdictions in Cluster 1 :
[1] "Alaska" "California" "Connecticut" "Florida"
[5] "Illinois" "Indiana" "Kentucky" "Maine"
[9] "Massachusetts" "Michigan" "Missouri" "Nebraska"
[13] "New York" "Ohio" "Oklahoma" "Oregon"
[17] "Pennsylvania" "South Dakota" "Texas" "Wisconsin"
[21] "Wyoming"
Jurisdictions in Cluster 2 :
[1] "Alabama" "Arizona" "Arkansas" "Colorado"
[5] "Louisiana" "Maryland" "Montana" "South Carolina"
[9] "Tennessee" "Vermont"
Jurisdictions in Cluster 3 :
[1] "Delaware" "Georgia" "Idaho" "Iowa"
[5] "Kansas" "Minnesota" "Mississippi" "National"
[9] "Nevada" "New Hampshire" "New Jersey" "New Mexico"
[13] "North Dakota" "Rhode Island" "Utah" "Virginia"
[17] "Washington" "West Virginia"
Jurisdictions in Cluster 4 :
[1] "Hawaii" "North Carolina"
# Load necessary libraries
library(ggplot2)
library(reshape2)
# Loop through each cluster
for (cluster_number in 1:num_clusters) {
cat("Plotting jurisdictions in Cluster", cluster_number, ":\n")
# Find the indices of jurisdictions in this cluster
indices_in_cluster <- which(cluster_assignments == cluster_number)
# Get the names of the jurisdictions in this cluster
jurisdictions_in_cluster <- jurisdiction[indices_in_cluster]
# Filter the gap_list_year data frame for these jurisdictions
cluster_data <- df[df$Jurisdiction %in% jurisdictions_in_cluster, ]
# Convert the data to long format for ggplot
long_df <- melt(cluster_data, id.vars = "Jurisdiction", variable.name = "Year", value.name = "Value")
# Plot
p <- ggplot(long_df, aes(x = Year, y = Value, group = Jurisdiction, color = Jurisdiction)) +
geom_line() +
labs(title = paste("Cluster", cluster_number), x = "Year", y = "Gap") +
theme(legend.position = "right")
print(p)
ggsave(paste("cluster_", cluster_number, ".png", sep=""), plot = p)
}
Plotting jurisdictions in Cluster 1 :
Plotting jurisdictions in Cluster 2 :
Plotting jurisdictions in Cluster 3 :
Plotting jurisdictions in Cluster 4 :